This collection of notebooks will explore several anomaly detection techniques applied to a simple dataset.
Dataset: https://www.kaggle.com/datasets/nphantawee/pump-sensor-data
The data come from all available sensors, and all of them are raw values. There are 52 sensors in total.
In the first Notebook we will perform the ordinary EDA analysis and we will prepare the data for further analysis.
# Standard imports for data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): silences ALL warnings, including useful deprecation notices
sns.set(style="whitegrid", font_scale=1.5)
%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 8]
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)
# NOTE(review): set_theme() re-applies seaborn's default rcParams and thereby
# overrides the figure size / font sizes configured just above — confirm this
# last call is intentional.
sns.set_theme()
# Read data
# Raw Kaggle pump-sensor CSV: 220320 rows with 52 sensor columns, an unnamed
# index column, a timestamp and a machine_status label (per the .info() dump
# below); rows appear to be minute-resolution readings — TODO confirm.
pump_df = pd.read_csv('./pump_data_set/sensor.csv')
pump_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 220320 entries, 0 to 220319 Data columns (total 55 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 220320 non-null int64 1 timestamp 220320 non-null object 2 sensor_00 210112 non-null float64 3 sensor_01 219951 non-null float64 4 sensor_02 220301 non-null float64 5 sensor_03 220301 non-null float64 6 sensor_04 220301 non-null float64 7 sensor_05 220301 non-null float64 8 sensor_06 215522 non-null float64 9 sensor_07 214869 non-null float64 10 sensor_08 215213 non-null float64 11 sensor_09 215725 non-null float64 12 sensor_10 220301 non-null float64 13 sensor_11 220301 non-null float64 14 sensor_12 220301 non-null float64 15 sensor_13 220301 non-null float64 16 sensor_14 220299 non-null float64 17 sensor_15 0 non-null float64 18 sensor_16 220289 non-null float64 19 sensor_17 220274 non-null float64 20 sensor_18 220274 non-null float64 21 sensor_19 220304 non-null float64 22 sensor_20 220304 non-null float64 23 sensor_21 220304 non-null float64 24 sensor_22 220279 non-null float64 25 sensor_23 220304 non-null float64 26 sensor_24 220304 non-null float64 27 sensor_25 220284 non-null float64 28 sensor_26 220300 non-null float64 29 sensor_27 220304 non-null float64 30 sensor_28 220304 non-null float64 31 sensor_29 220248 non-null float64 32 sensor_30 220059 non-null float64 33 sensor_31 220304 non-null float64 34 sensor_32 220252 non-null float64 35 sensor_33 220304 non-null float64 36 sensor_34 220304 non-null float64 37 sensor_35 220304 non-null float64 38 sensor_36 220304 non-null float64 39 sensor_37 220304 non-null float64 40 sensor_38 220293 non-null float64 41 sensor_39 220293 non-null float64 42 sensor_40 220293 non-null float64 43 sensor_41 220293 non-null float64 44 sensor_42 220293 non-null float64 45 sensor_43 220293 non-null float64 46 sensor_44 220293 non-null float64 47 sensor_45 220293 non-null float64 48 sensor_46 220293 non-null float64 49 sensor_47 220293 non-null 
float64 50 sensor_48 220293 non-null float64 51 sensor_49 220293 non-null float64 52 sensor_50 143303 non-null float64 53 sensor_51 204937 non-null float64 54 machine_status 220320 non-null object dtypes: float64(52), int64(1), object(2) memory usage: 92.5+ MB
From the .info() method we can see that there are columns such as 'Unnamed: 0', which merely duplicates the pandas index, and that 'sensor_15' contains no data at all.
# 'Unnamed: 0' merely duplicates the positional index and 'sensor_15'
# contains no data at all, so both columns can go.
pump_df = pump_df.drop(columns=["sensor_15", "Unnamed: 0"])

# Check missing values
print(pump_df.isnull().sum().sort_values(ascending=False))

# Parse the timestamp column and promote it to a DatetimeIndex
pump_df["timestamp"] = pd.to_datetime(pump_df["timestamp"])
pump_df = pump_df.set_index("timestamp")

# Store the machine status as a categorical dtype: cheaper in memory and
# handy for statistical plots
pump_df["machine_status"] = pump_df["machine_status"].astype("category")
sensor_50 77017 sensor_51 15383 sensor_00 10208 sensor_07 5451 sensor_08 5107 sensor_06 4798 sensor_09 4595 sensor_01 369 sensor_30 261 sensor_29 72 sensor_32 68 sensor_17 46 sensor_18 46 sensor_22 41 sensor_25 36 sensor_16 31 sensor_49 27 sensor_48 27 sensor_47 27 sensor_46 27 sensor_45 27 sensor_44 27 sensor_43 27 sensor_42 27 sensor_41 27 sensor_40 27 sensor_39 27 sensor_38 27 sensor_14 21 sensor_26 20 sensor_03 19 sensor_10 19 sensor_13 19 sensor_12 19 sensor_11 19 sensor_05 19 sensor_04 19 sensor_02 19 sensor_36 16 sensor_37 16 sensor_28 16 sensor_27 16 sensor_31 16 sensor_35 16 sensor_24 16 sensor_23 16 sensor_34 16 sensor_21 16 sensor_20 16 sensor_19 16 sensor_33 16 timestamp 0 machine_status 0 dtype: int64
# Drop every sensor column that is missing more than `threshold` readings.
# (The cut-off is an arbitrary criterion and can be tuned.)
threshold = 1000
missing_per_column = pump_df.isnull().sum()
pump_df = pump_df.loc[:, missing_per_column < threshold]
pump_df.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 220320 entries, 2018-04-01 00:00:00 to 2018-08-31 23:59:00 Data columns (total 45 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sensor_01 219951 non-null float64 1 sensor_02 220301 non-null float64 2 sensor_03 220301 non-null float64 3 sensor_04 220301 non-null float64 4 sensor_05 220301 non-null float64 5 sensor_10 220301 non-null float64 6 sensor_11 220301 non-null float64 7 sensor_12 220301 non-null float64 8 sensor_13 220301 non-null float64 9 sensor_14 220299 non-null float64 10 sensor_16 220289 non-null float64 11 sensor_17 220274 non-null float64 12 sensor_18 220274 non-null float64 13 sensor_19 220304 non-null float64 14 sensor_20 220304 non-null float64 15 sensor_21 220304 non-null float64 16 sensor_22 220279 non-null float64 17 sensor_23 220304 non-null float64 18 sensor_24 220304 non-null float64 19 sensor_25 220284 non-null float64 20 sensor_26 220300 non-null float64 21 sensor_27 220304 non-null float64 22 sensor_28 220304 non-null float64 23 sensor_29 220248 non-null float64 24 sensor_30 220059 non-null float64 25 sensor_31 220304 non-null float64 26 sensor_32 220252 non-null float64 27 sensor_33 220304 non-null float64 28 sensor_34 220304 non-null float64 29 sensor_35 220304 non-null float64 30 sensor_36 220304 non-null float64 31 sensor_37 220304 non-null float64 32 sensor_38 220293 non-null float64 33 sensor_39 220293 non-null float64 34 sensor_40 220293 non-null float64 35 sensor_41 220293 non-null float64 36 sensor_42 220293 non-null float64 37 sensor_43 220293 non-null float64 38 sensor_44 220293 non-null float64 39 sensor_45 220293 non-null float64 40 sensor_46 220293 non-null float64 41 sensor_47 220293 non-null float64 42 sensor_48 220293 non-null float64 43 sensor_49 220293 non-null float64 44 machine_status 220320 non-null category dtypes: category(1), float64(44) memory usage: 75.9 MB
# Replace the remaining NaNs with each column's mean value.
# FIX: restrict the mean to numeric columns — machine_status is categorical,
# and DataFrame.mean() over mixed dtypes raises a TypeError on pandas >= 2.0
# (older pandas silently skipped non-numeric columns, which is why this ever
# worked without the flag).
pump_df.fillna(pump_df.mean(numeric_only=True), inplace=True)
# Verify that no missing values remain
pump_df.isnull().sum()
sensor_01 0 sensor_02 0 sensor_03 0 sensor_04 0 sensor_05 0 sensor_10 0 sensor_11 0 sensor_12 0 sensor_13 0 sensor_14 0 sensor_16 0 sensor_17 0 sensor_18 0 sensor_19 0 sensor_20 0 sensor_21 0 sensor_22 0 sensor_23 0 sensor_24 0 sensor_25 0 sensor_26 0 sensor_27 0 sensor_28 0 sensor_29 0 sensor_30 0 sensor_31 0 sensor_32 0 sensor_33 0 sensor_34 0 sensor_35 0 sensor_36 0 sensor_37 0 sensor_38 0 sensor_39 0 sensor_40 0 sensor_41 0 sensor_42 0 sensor_43 0 sensor_44 0 sensor_45 0 sensor_46 0 sensor_47 0 sensor_48 0 sensor_49 0 machine_status 0 dtype: int64
pump_df.shape
(220320, 45)
pump_df.to_csv('pump_data_cleaned.csv')
pump_df.machine_status.value_counts()
NORMAL 205836 RECOVERING 14477 BROKEN 7 Name: machine_status, dtype: int64
There are 7 readings in which the pump is flagged as BROKEN; let's plot the sensor signals overlaid on the dates when those failures took place.
# Extract the readings taken while the pump was BROKEN / NORMAL
broken = pump_df[pump_df['machine_status'] == 'BROKEN']
normal = pump_df[pump_df['machine_status'] == 'NORMAL']

# Plot every sensor signal with the failure instants highlighted.
# FIX: the loop body had lost its indentation in extraction, and the old
# comment said "4 sensors" although the loop iterates all 44 sensor columns.
for name in pump_df.columns[0:44]:
    _ = plt.figure(figsize=(18, 5))
    # Vertical red lines mark the timestamps at which the pump broke
    _ = plt.vlines(broken.index, 0, pump_df[name].max(), linestyle='solid', color='red')
    # Red X: readings during BROKEN state; green o: readings during NORMAL state
    _ = plt.plot(broken[name], linestyle='none', marker='X', color='red', markersize=12)
    _ = plt.plot(normal[name], linestyle='none', marker='o', color='g', markersize=6)
    # Full signal trace in blue for context
    _ = plt.plot(pump_df[name], color='blue')
    _ = plt.title(name)
    plt.show()
From the plots it can be seen that not all sensors exhibit spikes on the failure dates, so it is important to select which ones are most useful for detecting the failures in advance. We will tackle this later with a PCA analysis.
from sklearn.preprocessing import StandardScaler

# Standardise every sensor channel to zero mean / unit variance.
# The categorical target column is excluded from scaling.
sensors = pump_df.drop(columns='machine_status')
sensors_scaled = pd.DataFrame(
    StandardScaler().fit_transform(sensors),
    columns=sensors.columns,
)
sensors_scaled.head()
| sensor_01 | sensor_02 | sensor_03 | sensor_04 | sensor_05 | sensor_10 | sensor_11 | sensor_12 | sensor_13 | sensor_14 | ... | sensor_40 | sensor_41 | sensor_42 | sensor_43 | sensor_44 | sensor_45 | sensor_46 | sensor_47 | sensor_48 | sensor_49 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.151675 | 0.639386 | 1.057675 | 0.303443 | 0.177097 | -0.350860 | 0.429379 | 0.195797 | -0.782084 | 0.377336 | ... | 0.080880 | -0.553995 | -0.358970 | -0.176799 | -0.260520 | 1.759633 | 0.185888 | -0.588642 | 0.086297 | 0.553138 |
| 1 | -0.151675 | 0.639386 | 1.057675 | 0.303443 | 0.177097 | -0.350860 | 0.429379 | 0.195797 | -0.782084 | 0.377336 | ... | 0.080880 | -0.553995 | -0.358970 | -0.176799 | -0.260520 | 1.759633 | 0.185888 | -0.588642 | 0.086297 | 0.553138 |
| 2 | -0.072613 | 0.639386 | 1.093565 | 0.334786 | 0.008647 | -0.297906 | 0.479396 | 0.291884 | -0.778154 | 0.388584 | ... | 0.032135 | -0.619939 | -0.358970 | -0.200379 | -0.285516 | 1.737092 | 0.204388 | -0.588641 | 0.061668 | 0.522906 |
| 3 | -0.151675 | 0.627550 | 1.093564 | 0.260045 | 0.207693 | -0.239029 | 0.516072 | 0.250679 | -0.796852 | 0.387713 | ... | 0.153997 | -0.619939 | -0.384354 | -0.271121 | -0.310513 | 1.692010 | 0.204388 | -0.588642 | 0.061668 | 0.507790 |
| 4 | -0.138499 | 0.639386 | 1.093564 | 0.317909 | 0.184568 | -0.163810 | 0.547239 | 0.278346 | -0.781725 | 0.380144 | ... | 0.373349 | -0.553995 | -0.384354 | -0.223959 | -0.335509 | 1.714550 | 0.241389 | -0.533219 | 0.089816 | 0.492674 |
5 rows × 44 columns
# The target is a categorical variable with 3 possible values
# (BROKEN / NORMAL / RECOVERING).
# We can use either OrdinalEncoder or OneHotEncoder from scikit-learn to
# process it properly. Here an ORDINAL encoding is applied (one integer code
# per category, sorted lexicographically), so the variable name
# `target_one_hot` is misleading — the result is NOT one-hot encoded.
from sklearn.preprocessing import OrdinalEncoder
ord_encoder = OrdinalEncoder()
target_one_hot = ord_encoder.fit_transform(pump_df.machine_status.values.reshape(-1,1))
target_one_hot
array([[1.],
[1.],
[1.],
...,
[1.],
[1.],
[1.]])
# Append the encoded target and persist the scaled dataset.
# NOTE(review): the target column is added to the SAME frame that is later
# passed to PCA in this notebook, so the target leaks into the decomposition
# — consider fitting PCA on the sensor columns only.
sensors_scaled['target_one_hot'] = target_one_hot
sensors_scaled.to_csv('sensors_scaled.csv')
sensors_scaled.head()
| sensor_01 | sensor_02 | sensor_03 | sensor_04 | sensor_05 | sensor_10 | sensor_11 | sensor_12 | sensor_13 | sensor_14 | ... | sensor_41 | sensor_42 | sensor_43 | sensor_44 | sensor_45 | sensor_46 | sensor_47 | sensor_48 | sensor_49 | target_one_hot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.151675 | 0.639386 | 1.057675 | 0.303443 | 0.177097 | -0.350860 | 0.429379 | 0.195797 | -0.782084 | 0.377336 | ... | -0.553995 | -0.358970 | -0.176799 | -0.260520 | 1.759633 | 0.185888 | -0.588642 | 0.086297 | 0.553138 | 1.0 |
| 1 | -0.151675 | 0.639386 | 1.057675 | 0.303443 | 0.177097 | -0.350860 | 0.429379 | 0.195797 | -0.782084 | 0.377336 | ... | -0.553995 | -0.358970 | -0.176799 | -0.260520 | 1.759633 | 0.185888 | -0.588642 | 0.086297 | 0.553138 | 1.0 |
| 2 | -0.072613 | 0.639386 | 1.093565 | 0.334786 | 0.008647 | -0.297906 | 0.479396 | 0.291884 | -0.778154 | 0.388584 | ... | -0.619939 | -0.358970 | -0.200379 | -0.285516 | 1.737092 | 0.204388 | -0.588641 | 0.061668 | 0.522906 | 1.0 |
| 3 | -0.151675 | 0.627550 | 1.093564 | 0.260045 | 0.207693 | -0.239029 | 0.516072 | 0.250679 | -0.796852 | 0.387713 | ... | -0.619939 | -0.384354 | -0.271121 | -0.310513 | 1.692010 | 0.204388 | -0.588642 | 0.061668 | 0.507790 | 1.0 |
| 4 | -0.138499 | 0.639386 | 1.093564 | 0.317909 | 0.184568 | -0.163810 | 0.547239 | 0.278346 | -0.781725 | 0.380144 | ... | -0.553995 | -0.384354 | -0.223959 | -0.335509 | 1.714550 | 0.241389 | -0.533219 | 0.089816 | 0.492674 | 1.0 |
5 rows × 45 columns
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the first four (scaled) sensor columns to
# eyeball relationships between channels
scatter_matrix(sensors_scaled.iloc[:,0:4], figsize = (12,7))
array([[<AxesSubplot: xlabel='sensor_01', ylabel='sensor_01'>,
<AxesSubplot: xlabel='sensor_02', ylabel='sensor_01'>,
<AxesSubplot: xlabel='sensor_03', ylabel='sensor_01'>,
<AxesSubplot: xlabel='sensor_04', ylabel='sensor_01'>],
[<AxesSubplot: xlabel='sensor_01', ylabel='sensor_02'>,
<AxesSubplot: xlabel='sensor_02', ylabel='sensor_02'>,
<AxesSubplot: xlabel='sensor_03', ylabel='sensor_02'>,
<AxesSubplot: xlabel='sensor_04', ylabel='sensor_02'>],
[<AxesSubplot: xlabel='sensor_01', ylabel='sensor_03'>,
<AxesSubplot: xlabel='sensor_02', ylabel='sensor_03'>,
<AxesSubplot: xlabel='sensor_03', ylabel='sensor_03'>,
<AxesSubplot: xlabel='sensor_04', ylabel='sensor_03'>],
[<AxesSubplot: xlabel='sensor_01', ylabel='sensor_04'>,
<AxesSubplot: xlabel='sensor_02', ylabel='sensor_04'>,
<AxesSubplot: xlabel='sensor_03', ylabel='sensor_04'>,
<AxesSubplot: xlabel='sensor_04', ylabel='sensor_04'>]],
dtype=object)
In the reduced scatter matrix quite a few relationships can be inferred at first glance; for example, sensor_02 has a linear relation with sensor_04.
In order to reduce dimensionality a PCA analysis is required.
from sklearn.decomposition import PCA
# Fit a full PCA (all components kept) to inspect the variance spectrum.
# NOTE(review): at this point sensors_scaled still contains the appended
# 'target_one_hot' column, so the target participates in the decomposition.
pca = PCA()
pca.fit(sensors_scaled)
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
# Plot the variance captured by each principal component.
# FIX: use `explained_variance_ratio_` instead of the raw
# `explained_variance_` so both the bars and the cumulative curve live on a
# 0-1 scale and can be read directly as fractions of total variance (the raw
# eigenvalues sum to roughly the number of standardized features, which made
# the old "cumulative" curve unreadable as a percentage).
# FIX: labels were passed to bar/step but plt.legend() was never called, so
# they never appeared — add the legend call.
features = range(pca.n_components_)
_ = plt.figure(figsize=(22, 12))
_ = plt.bar(features, pca.explained_variance_ratio_, label='Individual Explained Variance')
_ = plt.step(features, np.cumsum(pca.explained_variance_ratio_), label='Cumulative Explained Variance')
_ = plt.xlabel('PCA feature')
_ = plt.ylabel('Explained variance ratio')
_ = plt.xticks(features)
_ = plt.title("Important Principal Components")
_ = plt.legend()
plt.show()
# Project the data onto the two leading principal components
pca_2 = PCA(n_components=2)
projected = pca_2.fit_transform(sensors_scaled)
principal_df_2 = pd.DataFrame(projected, columns=['pc1', 'pc2'])
principal_df_2.head()
| pc1 | pc2 | |
|---|---|---|
| 0 | -0.051565 | 0.385080 |
| 1 | -0.051565 | 0.385080 |
| 2 | -0.191430 | 0.433948 |
| 3 | -0.193255 | 0.413210 |
| 4 | -0.149014 | 0.553438 |
# How many components are needed to retain 95% of the total variance?
# FIX: accumulate `explained_variance_ratio_` (fractions summing to 1), not
# the raw `explained_variance_` eigenvalues. The old cumsum of eigenvalues
# exceeded 0.95 at the very first component, so argmax fired immediately and
# reported dims == 1 — inconsistent with the 13 components needed for 90%.
# (The old comment's "0.95%" also meant 95%.)
pca_0p95 = PCA()
pca_0p95.fit(sensors_scaled)
cumsum = np.cumsum(pca_0p95.explained_variance_ratio_)
dims = np.argmax(cumsum >= 0.95) + 1
dims
1
pca_0p90 = PCA(n_components=0.90)# give the components which contain 75% of the variance
sensors_scaled_reduced_0p90 = pca_0p90.fit_transform(sensors_scaled)
sensors_scaled_reduced_0p90.shape
(220320, 13)
pca_0p75 = PCA(n_components=0.75)# give the components which contain 75% of the variance
sensors_scaled_reduced_0p75 = pca_0p75.fit_transform(sensors_scaled)
sensors_scaled_reduced_0p75.shape
# With 5 principal components (not raw sensors) we capture 75% of the variance
(220320, 5)
# Wrap the 75%-variance projection in a labelled DataFrame.
# Each column is the orthogonal projection onto one of the 5 leading components.
component_names = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5']
sensorsdf_red_0p75 = pd.DataFrame(sensors_scaled_reduced_0p75, columns=component_names)
sensorsdf_red_0p75.head()
| pc1 | pc2 | pc3 | pc4 | pc5 | |
|---|---|---|---|---|---|
| 0 | -0.051565 | 0.385080 | -0.594502 | -0.867637 | 2.235067 |
| 1 | -0.051565 | 0.385080 | -0.594502 | -0.867637 | 2.235067 |
| 2 | -0.191430 | 0.433948 | -0.603835 | -0.983176 | 2.338037 |
| 3 | -0.193255 | 0.413210 | -0.600183 | -1.084409 | 2.319278 |
| 4 | -0.149014 | 0.553438 | -0.492572 | -1.138417 | 2.232818 |
# Persist the 5-component projection for the following notebooks
sensorsdf_red_0p75.to_csv('sensorsdf_red_pca_0p75.csv')
from statsmodels.tsa.stattools import adfuller
# Run Augmented Dickey Fuller Test on the first principal component.
# Null hypothesis: the series has a unit root (is non-stationary).
result = adfuller(sensorsdf_red_0p75['pc1'])
# Print p-value (a small value rejects the unit-root null -> stationary)
print(result[1])
0.00014188450768720347
# Autocorrelation
# Compute the relative change between consecutive pc1 readings.
# NOTE(review): the original comment said "change in daily mean", but
# pct_change() here is row-to-row on the raw series, not a daily aggregate.
# Also, pct_change of a signed, zero-crossing PCA score can explode near
# zero — confirm this is the intended transform.
pca1 = sensorsdf_red_0p75['pc1'].pct_change()
# Lag-1 autocorrelation of the change series (NaN from the first diff dropped)
autocorrelation = pca1.dropna().autocorr()
print('Autocorrelation is: ', autocorrelation)
Autocorrelation is: -3.421205808953816e-05
# Plot the autocorrelation function (20 lags, 95% confidence band) for the
# change series of the first two principal components.
# FIX: plot_acf was imported twice in the original cells; one import suffices.
from statsmodels.graphics.tsaplots import plot_acf

# ACF of the pc1 change series (computed above)
plot_acf(pca1.dropna(), lags=20, alpha=0.05)
plt.show()

# ACF of the pc2 change series
plot_acf(sensorsdf_red_0p75['pc2'].pct_change().dropna(), lags=20, alpha=0.05)
plt.show()